{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Genome Processing & Annotation\n",
    "\n",
    "- This notebook generates annotations for the genes present in the MSK IMPACT dataset, in reference to GrCh37 build.\n",
    "\n",
    "IMPACT dataset: \n",
    "\n",
    "For more information regarding collection methods, etc., see: \n",
    "- https://www.mskcc.org/msk-impact\n",
    "- https://datacatalog.mskcc.org/dataset/10438"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/samgould/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.8' currently installed).\n",
      "  warnings.warn(msg, UserWarning)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/samgould/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3267: DtypeWarning: Columns (45,48,88) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "filepath = '/Volumes/Sam_G_SSD/2020-06-16-MSK-IMPACT_EDITED.txt'\n",
    "impact_data = pd.read_csv(filepath, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Hugo_Symbol</th>\n",
       "      <th>Entrez_Gene_Id</th>\n",
       "      <th>Center</th>\n",
       "      <th>NCBI_Build</th>\n",
       "      <th>Chromosome</th>\n",
       "      <th>Start_Position</th>\n",
       "      <th>End_Position</th>\n",
       "      <th>Strand</th>\n",
       "      <th>Consequence</th>\n",
       "      <th>Variant_Classification</th>\n",
       "      <th>...</th>\n",
       "      <th>VARIANT_CLASS</th>\n",
       "      <th>all_effects</th>\n",
       "      <th>amino_acid_change</th>\n",
       "      <th>cDNA_Change</th>\n",
       "      <th>cDNA_position</th>\n",
       "      <th>cdna_change</th>\n",
       "      <th>comments</th>\n",
       "      <th>n_depth</th>\n",
       "      <th>t_depth</th>\n",
       "      <th>transcript</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>BRCA2</td>\n",
       "      <td>675</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>13</td>\n",
       "      <td>32937315</td>\n",
       "      <td>32937315</td>\n",
       "      <td>+</td>\n",
       "      <td>splice_acceptor_variant</td>\n",
       "      <td>Splice_Site</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BRCA2</td>\n",
       "      <td>0</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>37</td>\n",
       "      <td>13</td>\n",
       "      <td>32914437</td>\n",
       "      <td>32914438</td>\n",
       "      <td>+</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>MUTYH</td>\n",
       "      <td>4595</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>1</td>\n",
       "      <td>45798475</td>\n",
       "      <td>45798475</td>\n",
       "      <td>+</td>\n",
       "      <td>missense_variant</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BRCA2</td>\n",
       "      <td>675</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>13</td>\n",
       "      <td>32893302</td>\n",
       "      <td>32893302</td>\n",
       "      <td>+</td>\n",
       "      <td>frameshift_variant</td>\n",
       "      <td>Frame_Shift_Ins</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>BRCA1</td>\n",
       "      <td>0</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>37</td>\n",
       "      <td>17</td>\n",
       "      <td>41251824</td>\n",
       "      <td>41251825</td>\n",
       "      <td>+</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422817</th>\n",
       "      <td>SMARCA4</td>\n",
       "      <td>6597</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>19</td>\n",
       "      <td>11144132</td>\n",
       "      <td>11144132</td>\n",
       "      <td>+</td>\n",
       "      <td>missense_variant</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422818</th>\n",
       "      <td>BRAF</td>\n",
       "      <td>673</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>7</td>\n",
       "      <td>140453149</td>\n",
       "      <td>140453149</td>\n",
       "      <td>+</td>\n",
       "      <td>missense_variant</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422819</th>\n",
       "      <td>NRAS</td>\n",
       "      <td>4893</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>1</td>\n",
       "      <td>115258747</td>\n",
       "      <td>115258747</td>\n",
       "      <td>+</td>\n",
       "      <td>missense_variant</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422820</th>\n",
       "      <td>TERT</td>\n",
       "      <td>7015</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>5</td>\n",
       "      <td>1295521</td>\n",
       "      <td>1295521</td>\n",
       "      <td>+</td>\n",
       "      <td>upstream_gene_variant</td>\n",
       "      <td>5'Flank</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422821</th>\n",
       "      <td>KRAS</td>\n",
       "      <td>3845</td>\n",
       "      <td>MSKCC</td>\n",
       "      <td>GRCh37</td>\n",
       "      <td>12</td>\n",
       "      <td>25398284</td>\n",
       "      <td>25398284</td>\n",
       "      <td>+</td>\n",
       "      <td>missense_variant</td>\n",
       "      <td>Missense_Mutation</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>422822 rows × 123 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       Hugo_Symbol  Entrez_Gene_Id Center NCBI_Build Chromosome  \\\n",
       "0            BRCA2             675  MSKCC     GRCh37         13   \n",
       "1            BRCA2               0  MSKCC         37         13   \n",
       "2            MUTYH            4595  MSKCC     GRCh37          1   \n",
       "3            BRCA2             675  MSKCC     GRCh37         13   \n",
       "4            BRCA1               0  MSKCC         37         17   \n",
       "...            ...             ...    ...        ...        ...   \n",
       "422817     SMARCA4            6597  MSKCC     GRCh37         19   \n",
       "422818        BRAF             673  MSKCC     GRCh37          7   \n",
       "422819        NRAS            4893  MSKCC     GRCh37          1   \n",
       "422820        TERT            7015  MSKCC     GRCh37          5   \n",
       "422821        KRAS            3845  MSKCC     GRCh37         12   \n",
       "\n",
       "        Start_Position  End_Position Strand              Consequence  \\\n",
       "0             32937315      32937315      +  splice_acceptor_variant   \n",
       "1             32914437      32914438      +                      NaN   \n",
       "2             45798475      45798475      +         missense_variant   \n",
       "3             32893302      32893302      +       frameshift_variant   \n",
       "4             41251824      41251825      +                      NaN   \n",
       "...                ...           ...    ...                      ...   \n",
       "422817        11144132      11144132      +         missense_variant   \n",
       "422818       140453149     140453149      +         missense_variant   \n",
       "422819       115258747     115258747      +         missense_variant   \n",
       "422820         1295521       1295521      +    upstream_gene_variant   \n",
       "422821        25398284      25398284      +         missense_variant   \n",
       "\n",
       "       Variant_Classification  ... VARIANT_CLASS all_effects  \\\n",
       "0                 Splice_Site  ...           NaN         NaN   \n",
       "1                         NaN  ...           NaN         NaN   \n",
       "2           Missense_Mutation  ...           NaN         NaN   \n",
       "3             Frame_Shift_Ins  ...           NaN         NaN   \n",
       "4                         NaN  ...           NaN         NaN   \n",
       "...                       ...  ...           ...         ...   \n",
       "422817      Missense_Mutation  ...           NaN         NaN   \n",
       "422818      Missense_Mutation  ...           NaN         NaN   \n",
       "422819      Missense_Mutation  ...           NaN         NaN   \n",
       "422820                5'Flank  ...           NaN         NaN   \n",
       "422821      Missense_Mutation  ...           NaN         NaN   \n",
       "\n",
       "       amino_acid_change cDNA_Change cDNA_position  cdna_change comments  \\\n",
       "0                    NaN         NaN           NaN          NaN      NaN   \n",
       "1                    NaN         NaN           NaN          NaN      NaN   \n",
       "2                    NaN         NaN           NaN          NaN      NaN   \n",
       "3                    NaN         NaN           NaN          NaN      NaN   \n",
       "4                    NaN         NaN           NaN          NaN      NaN   \n",
       "...                  ...         ...           ...          ...      ...   \n",
       "422817               NaN         NaN           NaN          NaN      NaN   \n",
       "422818               NaN         NaN           NaN          NaN      NaN   \n",
       "422819               NaN         NaN           NaN          NaN      NaN   \n",
       "422820               NaN         NaN           NaN          NaN      NaN   \n",
       "422821               NaN         NaN           NaN          NaN      NaN   \n",
       "\n",
       "        n_depth  t_depth  transcript  \n",
       "0           NaN      NaN         NaN  \n",
       "1           NaN      NaN         NaN  \n",
       "2           NaN      NaN         NaN  \n",
       "3           NaN      NaN         NaN  \n",
       "4           NaN      NaN         NaN  \n",
       "...         ...      ...         ...  \n",
       "422817      NaN      NaN         NaN  \n",
       "422818      NaN      NaN         NaN  \n",
       "422819      NaN      NaN         NaN  \n",
       "422820      NaN      NaN         NaN  \n",
       "422821      NaN      NaN         NaN  \n",
       "\n",
       "[422822 rows x 123 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "impact_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "unique_genes = np.unique(np.asarray(impact_data['Hugo_Symbol']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading in Reference Genome GRCh37\n",
    "\n",
    "- can check for PAM sequences on either side of the mutation (+/- strand?)\n",
    "- Downloading reference sequence from: https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.26/ (assembly GRCh38)\n",
    "- GRCh37: https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.25\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio import SeqIO\n",
    "import gzip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['37', 'GRCh37'], dtype=object)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#MSK IMPACT dataset uses GrCh37 build\n",
    "np.unique(np.asarray(impact_data['NCBI_Build']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "file = '/Volumes/Sam_G_SSD/GRCh37/ncbi-genomes-2022-03-17/GCF_000001405.25_GRCh37.p13_genomic.fna.gz'\n",
    "\n",
    "with gzip.open(file, \"rt\") as handle:\n",
    "    records = list(SeqIO.parse(handle, \"fasta\")) #about 4 Gb in  memory\n",
    "    #records = list that contains sequences split up by chromosome (and intrachromosome splits up to some size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['NC_000001.10 Homo sapiens chromosome 1, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000002.11 Homo sapiens chromosome 2, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000003.11 Homo sapiens chromosome 3, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000004.11 Homo sapiens chromosome 4, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000005.9 Homo sapiens chromosome 5, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000006.11 Homo sapiens chromosome 6, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000007.13 Homo sapiens chromosome 7, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000008.10 Homo sapiens chromosome 8, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000009.11 Homo sapiens chromosome 9, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000010.10 Homo sapiens chromosome 10, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000011.9 Homo sapiens chromosome 11, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000012.11 Homo sapiens chromosome 12, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000013.10 Homo sapiens chromosome 13, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000014.8 Homo sapiens chromosome 14, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000015.9 Homo sapiens chromosome 15, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000016.9 Homo sapiens chromosome 16, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000017.10 Homo sapiens chromosome 17, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000018.9 Homo sapiens chromosome 18, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000019.9 Homo sapiens chromosome 19, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000020.10 Homo sapiens chromosome 20, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000021.8 Homo sapiens chromosome 21, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000022.10 Homo sapiens chromosome 22, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000023.10 Homo sapiens chromosome X, GRCh37.p13 Primary Assembly',\n",
       " 'NC_000024.9 Homo sapiens chromosome Y, GRCh37.p13 Primary Assembly',\n",
       " 'NC_012920.1 Homo sapiens mitochondrion, complete genome']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#filtering out alternative sequences to only select consensus matches\n",
    "\n",
    "\n",
    "wrong = [\"alternate\", \"unplaced\", \"unlocalized\", \"patch\"]\n",
    "badlist = []\n",
    "for key in wrong:\n",
    "    for i in records:\n",
    "        ii = i.description\n",
    "        if key in ii:\n",
    "            badlist.append(ii)\n",
    "            \n",
    "filtered = []\n",
    "index_list = []\n",
    "for idx, i in enumerate(records):\n",
    "    ii = i.description\n",
    "    if ii not in badlist:\n",
    "        filtered.append(ii)\n",
    "        index_list.append(idx)\n",
    "        \n",
    "filtered\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN', SingleLetterAlphabet()), id='NC_000020.10', name='NC_000020.10', description='NC_000020.10 Homo sapiens chromosome 20, GRCh37.p13 Primary Assembly', dbxrefs=[])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chromosome = 20\n",
    "#22 (23-1) = X\n",
    "#23 (24-1) = Y\n",
    "#24 (25-1) = mitochondrial DNA\n",
    "records[index_list[chromosome-1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ref = TCATCG\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Seq('TCATCG', SingleLetterAlphabet())"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "k=902\n",
    "print('ref = ' + impact_data.iloc[[k]]['Reference_Allele'].values[0])\n",
    "#print('al1 = ' + impact_data.iloc[[k]]['Tumor_Seq_Allele1'].values[0])\n",
    "#print('al2 = ' + impact_data.iloc[[k]]['Tumor_Seq_Allele2'].values[0])\n",
    "seq_start = impact_data.iloc[[k]]['Start_Position'].values[0]\n",
    "seq_end = impact_data.iloc[[k]]['End_Position'].values[0]\n",
    "chromosome = impact_data.iloc[[k]]['Chromosome'].values[0]\n",
    "seq1 = records[index_list[int(chromosome)-1]].seq\n",
    "seq1[seq_start-1: seq_end]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sequence start is offset by 1 (i.e. need to subtract 1 from start position to get true sequence information). "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Accessing genome coordinates from gene name\n",
    "\n",
    "https://medium.com/intothegenomics/annotate-genes-and-genomic-coordinates-using-python-9259efa6ffc2\n",
    "\n",
    "https://blog.liang2.tw/posts/2018/06/gene-annotation-using-gffutils/\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gffutils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-03-18 11:45:41,842 - INFO - Committing changes: 2619000 features\n",
      "2022-03-18 11:45:43,637 - INFO - Populating features table and first-order relations: 2619443 features\n",
      "2022-03-18 11:45:43,641 - INFO - Creating relations(parent) index\n",
      "2022-03-18 11:45:57,811 - INFO - Creating relations(child) index\n",
      "2022-03-18 11:46:13,210 - INFO - Creating features(featuretype) index\n",
      "2022-03-18 11:46:54,272 - INFO - Creating features (seqid, start, end) index\n",
      "2022-03-18 11:47:16,452 - INFO - Creating features (seqid, start, end, strand) index\n",
      "2022-03-18 11:47:38,770 - INFO - Running ANALYZE features\n"
     ]
    }
   ],
   "source": [
    "#MOVED THESE FILES AND the CREATED DIRECTORY FILE TO EXTERNAL SSD!!!\n",
    "file = '/Users/samgould/Desktop/FSR Lab/2022-03-17/gencode.v19.annotation.gtf_withproteinids'\n",
    "\n",
    "db = gffutils.create_db(\n",
    "    file,\n",
    "    dbfn='gencode_v19.db',\n",
    "    verbose=True,\n",
    "    merge_strategy='error',\n",
    "    disable_infer_transcripts=True,\n",
    "    disable_infer_genes=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('chr1', 243419358, 243663394, '+')"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tx1 = db['ENST00000366541.3']; \n",
    "tx1.chrom, tx1.start, tx1.end, tx1.strand\n",
    "#tx1.attributes.items()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ENSP00000269305.4'"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tx = db['ENST00000269305.4'];\n",
    "tx.attributes.items()[9][1][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7590856"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tx.end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7590856"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gene = db['ENSG00000141510.11']\n",
    "gene.end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Accessing the transcript coordinates for each gene by getting the start, end and strand of each gene.\n",
    "\n",
    "I'm only getting the transcript coordinates, so not the 3' or 5' UTRs right now (need the gene id, which isn't provided here). I'll check if any of the mutations fall outside of these regions as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SDCCAG8NO ID\n"
     ]
    }
   ],
   "source": [
    "unique_genes = np.unique(np.asarray(impact_data['Hugo_Symbol']))\n",
    "\n",
    "id_list = []\n",
    "\n",
    "for i in unique_genes:\n",
    "    k = np.asarray(impact_data[impact_data['Hugo_Symbol'] == i]['HGVSc'])\n",
    "    #print(k)\n",
    "    len(k)\n",
    "    for idx, j in enumerate(k):\n",
    "        if type(j)==str:\n",
    "            id_list.append(j)\n",
    "            #print(j)\n",
    "            break\n",
    "                #print(j)\n",
    "        else:\n",
    "            #print('false')\n",
    "            if (idx+1)==len(k):\n",
    "                print(str(i) + 'NO ID')\n",
    "                #missing gene is'SDCCAG8'; id = ENST00000366541.3\n",
    "                id_list.append('ENST00000366541.3')\n",
    "            else:\n",
    "                continue\n",
    "            \n",
    "                \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "594"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(id_list) #good; matches unique_genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#now splitting before : to remove SNP information\n",
    "tx_ids = [i.split(':')[0] for i in id_list]\n",
    "\n",
    "#and now getting lists of attributes\n",
    "chrom = []\n",
    "tx_start = []\n",
    "tx_end = []\n",
    "gene_start = []\n",
    "gene_end = []\n",
    "strand = []\n",
    "gene_id = []\n",
    "for i in tx_ids:\n",
    "    tx = db[i]\n",
    "    geneid = tx.attributes.items()[0][1][0]\n",
    "    gene_id.append(geneid)\n",
    "    \n",
    "    \n",
    "    #and now properties of gene\n",
    "    gene = db[geneid]\n",
    "    gene_chrom = gene.chrom\n",
    "    genestart = gene.start\n",
    "    geneend = gene.end\n",
    "    \n",
    "    \n",
    "    chrom.append(gene_chrom)\n",
    "    gene_start.append(genestart)\n",
    "    gene_end.append(geneend)\n",
    "    \n",
    "    #and properties of transcript\n",
    "    txstart = tx.start\n",
    "    txend = tx.end\n",
    "    txstrand = tx.strand\n",
    "    \n",
    "    tx_start.append(txstart)\n",
    "    tx_end.append(txend)\n",
    "    strand.append(txstrand)\n",
    "    \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "#collating everything into a pandas dataframe and saving as csv for future use\n",
    "\n",
    "gene_info = pd.DataFrame(list(zip(unique_genes, gene_id, tx_ids, \n",
    "                          chrom, gene_start, gene_end, tx_start, tx_end, strand)),\n",
    "               columns =['gene', 'gene_id', 'transcript_id', \n",
    "                          'chrom', 'gene_start', 'gene_end', 'transcript_start', \n",
    "                         'transcript_end', 'strand'])\n",
    "\n",
    "#save this \n",
    "#gene_info.to_csv(...)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading in gene coordinate information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene</th>\n",
       "      <th>gene_id</th>\n",
       "      <th>transcript_id</th>\n",
       "      <th>chrom</th>\n",
       "      <th>gene_start</th>\n",
       "      <th>gene_end</th>\n",
       "      <th>transcript_start</th>\n",
       "      <th>transcript_end</th>\n",
       "      <th>strand</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ABL1</td>\n",
       "      <td>ENSG00000097007.13</td>\n",
       "      <td>ENST00000318560.5</td>\n",
       "      <td>chr9</td>\n",
       "      <td>133589333</td>\n",
       "      <td>133763062</td>\n",
       "      <td>133710453</td>\n",
       "      <td>133763062</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AC004906.3</td>\n",
       "      <td>ENSG00000237286.1</td>\n",
       "      <td>ENST00000423194.1</td>\n",
       "      <td>chr7</td>\n",
       "      <td>2983669</td>\n",
       "      <td>2986725</td>\n",
       "      <td>2983669</td>\n",
       "      <td>2986725</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AC008738.1</td>\n",
       "      <td>ENSG00000230259.2</td>\n",
       "      <td>ENST00000425420.2</td>\n",
       "      <td>chr19</td>\n",
       "      <td>33790853</td>\n",
       "      <td>33793430</td>\n",
       "      <td>33790853</td>\n",
       "      <td>33793430</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ACTG1</td>\n",
       "      <td>ENSG00000184009.5</td>\n",
       "      <td>ENST00000575842.1</td>\n",
       "      <td>chr17</td>\n",
       "      <td>79476997</td>\n",
       "      <td>79490873</td>\n",
       "      <td>79477015</td>\n",
       "      <td>79479807</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ACVR1</td>\n",
       "      <td>ENSG00000115170.9</td>\n",
       "      <td>ENST00000263640.3</td>\n",
       "      <td>chr2</td>\n",
       "      <td>158592958</td>\n",
       "      <td>158732374</td>\n",
       "      <td>158592958</td>\n",
       "      <td>158731623</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>589</th>\n",
       "      <td>XRCC2</td>\n",
       "      <td>ENSG00000196584.2</td>\n",
       "      <td>ENST00000359321.1</td>\n",
       "      <td>chr7</td>\n",
       "      <td>152341864</td>\n",
       "      <td>152373250</td>\n",
       "      <td>152343589</td>\n",
       "      <td>152373250</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>590</th>\n",
       "      <td>YAP1</td>\n",
       "      <td>ENSG00000137693.9</td>\n",
       "      <td>ENST00000282441.5</td>\n",
       "      <td>chr11</td>\n",
       "      <td>101981192</td>\n",
       "      <td>102104154</td>\n",
       "      <td>101981192</td>\n",
       "      <td>102104154</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>591</th>\n",
       "      <td>YES1</td>\n",
       "      <td>ENSG00000176105.9</td>\n",
       "      <td>ENST00000314574.4</td>\n",
       "      <td>chr18</td>\n",
       "      <td>721588</td>\n",
       "      <td>812547</td>\n",
       "      <td>721748</td>\n",
       "      <td>812239</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>592</th>\n",
       "      <td>ZFHX3</td>\n",
       "      <td>ENSG00000140836.10</td>\n",
       "      <td>ENST00000268489.5</td>\n",
       "      <td>chr16</td>\n",
       "      <td>72816784</td>\n",
       "      <td>73093597</td>\n",
       "      <td>72816784</td>\n",
       "      <td>73082274</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>593</th>\n",
       "      <td>ZRSR2</td>\n",
       "      <td>ENSG00000169249.8</td>\n",
       "      <td>ENST00000307771.7</td>\n",
       "      <td>chrX</td>\n",
       "      <td>15808595</td>\n",
       "      <td>15841383</td>\n",
       "      <td>15808595</td>\n",
       "      <td>15841383</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>594 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           gene             gene_id      transcript_id  chrom  gene_start  \\\n",
       "0          ABL1  ENSG00000097007.13  ENST00000318560.5   chr9   133589333   \n",
       "1    AC004906.3   ENSG00000237286.1  ENST00000423194.1   chr7     2983669   \n",
       "2    AC008738.1   ENSG00000230259.2  ENST00000425420.2  chr19    33790853   \n",
       "3         ACTG1   ENSG00000184009.5  ENST00000575842.1  chr17    79476997   \n",
       "4         ACVR1   ENSG00000115170.9  ENST00000263640.3   chr2   158592958   \n",
       "..          ...                 ...                ...    ...         ...   \n",
       "589       XRCC2   ENSG00000196584.2  ENST00000359321.1   chr7   152341864   \n",
       "590        YAP1   ENSG00000137693.9  ENST00000282441.5  chr11   101981192   \n",
       "591        YES1   ENSG00000176105.9  ENST00000314574.4  chr18      721588   \n",
       "592       ZFHX3  ENSG00000140836.10  ENST00000268489.5  chr16    72816784   \n",
       "593       ZRSR2   ENSG00000169249.8  ENST00000307771.7   chrX    15808595   \n",
       "\n",
       "      gene_end  transcript_start  transcript_end strand  \n",
       "0    133763062         133710453       133763062      +  \n",
       "1      2986725           2983669         2986725      +  \n",
       "2     33793430          33790853        33793430      -  \n",
       "3     79490873          79477015        79479807      -  \n",
       "4    158732374         158592958       158731623      -  \n",
       "..         ...               ...             ...    ...  \n",
       "589  152373250         152343589       152373250      -  \n",
       "590  102104154         101981192       102104154      +  \n",
       "591     812547            721748          812239      -  \n",
       "592   73093597          72816784        73082274      -  \n",
       "593   15841383          15808595        15841383      +  \n",
       "\n",
       "[594 rows x 9 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename1 = '/Users/samgould/Desktop/FSR Lab/2022-03-17/gene_info.csv'\n",
    "df1 = pd.read_csv(filename1)\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
